library(renv)
renv::load()
renv::restore() # Install missing librairies.
renv::status() # Check the project state.Modelling eel abundance in the Vilaine bassin
0. Setup
Load dependencies.
library(zen4R)
library(here)Download data from Zenodo. The data file is relatively heavy (~800Mb), this operation can take a few minutes.
doi <- "10.5281/zenodo.17962542"
dest_dir <- here("data")if (!dir.exists(dest_dir)) dir.create(dest_dir, recursive = TRUE)
download_zenodo(doi, path = dest_dir)
unzip(here(dest_dir, "miste_data.zip"), exdir = dest_dir)If everything went well, the data should have been downloaded in data/zenodo/.
1. River data
First, we focus on the river geographic data. Let’s load it.
base::load(here(dest_dir, "zenodo", "processed_data", "reseaux_hydrographiques.rda"))For now, we will focus on the Vilaine bassin. We can view it using sfnetworks.
library(sf)
library(sfnetworks)
net <- as_sfnetwork(rht_vilaine)
plot(net, cex = 0.5)We can see that the hydrographic network is not entirely connected. For example, a small portion in the south west is disconnected from the rest of the bassin. We should keep this in mind for later, as it could bias our statistical model.
We want to check that the network data is valid. In particular, we want to verify that there is no duplicate in the nodes, or edges data.
nodes <- net |>
activate("nodes") |>
st_as_sf()
sum(duplicated(nodes))[1] 0
library(dplyr)
library(tidyr)
edges <- net |>
activate("edges") |>
st_as_sf()
sum(duplicated(select(edges, c("from", "to"))))[1] 0
We see that there is no duplication for the Vilaine network. We can proceed to the next step.
2. Environmental data
2.1 Load data
base::load(here(dest_dir, "zenodo", "processed_data", "poissons_env_temp.RData"))
env# A tibble: 5,777 × 14
sta_id pop_id ope_id ope_date annee distance_mer altitude
<int> <int> <int> <dttm> <dbl> <int> <int>
1 9655 37720 22622 1996-09-24 10:09:00 1996 NA 305
2 9225 35420 16755 1998-06-19 09:30:00 1998 NA 80
3 9622 37544 16012 2000-05-22 14:45:00 2000 328 98
4 9762 38357 17091 2006-09-28 14:00:00 2006 216 31
5 9384 36157 17349 2006-06-21 14:30:00 2006 482 140
6 9712 38066 17621 2016-09-08 10:30:00 2016 400 130
7 9454 36580 17379 2010-09-14 15:00:00 2010 500 124
8 9520 36941 17079 2007-06-14 14:30:00 2007 224 37
9 9381 36135 17154 2006-06-20 00:00:00 2006 387 100
10 9284 35724 17596 2016-09-29 10:00:00 2016 212 32
# ℹ 5,767 more rows
# ℹ 7 more variables: surface_bv <dbl>, distance_source <dbl>, largeur <dbl>,
# pente <dbl>, profondeur <dbl>, temp_juillet <dbl>, temp_janvier <dbl>
Let’s rename column for clarity.
env <- env |> rename(year = annee)Station geographical coordinates are stored in points_geo_<river>. We visualize them over the hydrographic network.
plot(net, col = "black", cex = 0.5)
plot(points_geo_vilaine, col = "red", add = TRUE, pch = 16, cex = 0.5)2.2. Visualisation of altitude data
To begin with, let’s focus on a single environment variable: the altitude.
d_env <- env |>
select(c("sta_id", "pop_id", "altitude")) |>
distinct()
d_env |> filter(is.na(altitude))# A tibble: 2 × 3
sta_id pop_id altitude
<int> <int> <int>
1 8853 33379 NA
2 13994 51300 NA
We see that we have missing values for two stations: 8853 and 13994. Next, we want to relate altitude values to geographical coordinates.
pts_vilaine <- points_geo_vilaine |>
select(c("pop_id", "geometry")) |>
left_join(d_env, by = "pop_id") |>
filter(!is.na(altitude))
pts_vilaineSimple feature collection with 37 features and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -2.965245 ymin: 47.46923 xmax: -1.051178 ymax: 48.3493
Geodetic CRS: WGS 84
First 10 features:
pop_id sta_id altitude geometry
1 46885 12253 25 POINT (-2.429369 47.71654)
2 46166 11854 176 POINT (-2.965245 48.32017)
3 47601 12434 11 POINT (-1.796815 47.46923)
4 46671 12118 9 POINT (-2.332695 47.76529)
5 47659 12447 3 POINT (-2.142875 47.5878)
6 47613 12436 5 POINT (-1.867647 47.70129)
7 46484 12007 41 POINT (-2.35679 48.01553)
8 47460 12407 48 POINT (-1.410521 47.71718)
9 47677 12451 16 POINT (-2.563233 47.59471)
10 47326 12380 30 POINT (-1.562385 48.03841)
We see that the altitude is not measured on every sampling points, but only in 37 locations. Let’s plot them.
library(ggplot2)
set_theme(theme_minimal())
options(
ggplot2.continuous.colour = "viridis",
ggplot2.continuous.fill = "viridis"
)
ggplot() +
geom_sf(data = points_geo_vilaine, color = "grey80", alpha = 0.5) +
geom_sf(data = pts_vilaine, aes(color = altitude), size = 5) +
labs(
color = "Altitude (m)"
)We see in grey every location were we have sample of fish abundances, and in colour where we have altitude data. We have to interpolate altitude data for missing locations. The simplest way to do it is to the “nearest neighbour” interpolation.
2.3. Interpolation of altitude data
We first build proximity polygons with the terra::voronoi function.
library(terra)
d_elev <- select(pts_vilaine, c("geometry", "altitude"))
d_elev.vect <- terra::vect(d_elev)
v <- voronoi(d_elev.vect)
plot(v, "altitude")
points(d_elev.vect)v.sf <- st_as_sf(v)
pred <- st_intersection(v.sf, points_geo_vilaine)
d_elev.nn <- select(pred, c("altitude", "pop_id", "geometry"))
ggplot() +
geom_sf(data = pred, aes(color = altitude)) +
geom_sf(data = pts_vilaine, aes(color = altitude), size = 5) +
labs(
color = "Altitude (m)"
)Predicted altitudes seem consistent with the nearest neighbour interpolation.
2.4. Cross-validation of altitude interpolation
Now, that we have a prediction pipeline working we will estimate its performance using cross-validation. To do so, we will do a LOOCV (Leave-One-Out Cross-Validation).
n <- nrow(d_elev)
preds <- numeric(n) # Vector filled with 0s.
for (i in 1:n) {
train <- d_elev[-i, ]
test <- d_elev[i, ]
v_train <- voronoi(terra::vect(train))
preds[i] <- st_intersection(st_as_sf(v_train), test)$altitude
}
d_elev$altitude.loo <- preds
ggplot(d_elev, aes(x = altitude, y = altitude.loo)) +
geom_point() +
geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
labs(
x = "Observed altitude (m)",
y = "Predicted altitude (m)"
)rmse <- sqrt(mean((d_elev$altitude - d_elev$altitude.loo)^2))
rmse[1] 18.74797
3. Abundance data
3.1. Load data
catches <- catches |>
rename(
species = esp_code_alternatif,
year = annee,
abundance = effectif
)
catches# A tibble: 69,823 × 6
sta_id pop_id ope_id year species abundance
<int> <int> <int> <dbl> <chr> <int>
1 8530 31870 19943 2017 TRF 159
2 8530 31870 19943 2017 VAI 82
3 8530 31870 19944 2011 TRF 217
4 8530 31870 19944 2011 VAI 7
5 8530 31870 19945 2009 TRF 232
6 8530 31870 19945 2009 VAI 9
7 8530 31870 19946 2007 TRF 213
8 8530 31870 19946 2007 VAI 36
9 8530 31870 83029 2019 TRF 126
10 8530 31870 83029 2019 VAI 104
# ℹ 69,813 more rows
species_names <- unique(catches$species)Next, let’s add zeros where species are not found.
catches <- catches |>
pivot_wider(
names_from = species,
values_from = abundance,
values_fill = list(abundance = 0)
) |>
pivot_longer(
cols = species_names,
names_to = "species",
values_to = "abundance"
)
catches# A tibble: 450,606 × 6
sta_id pop_id ope_id year species abundance
<int> <int> <int> <dbl> <chr> <int>
1 8530 31870 19943 2017 TRF 159
2 8530 31870 19943 2017 VAI 82
3 8530 31870 19943 2017 LOF 0
4 8530 31870 19943 2017 CHA 0
5 8530 31870 19943 2017 SDF 0
6 8530 31870 19943 2017 PFL 0
7 8530 31870 19943 2017 BAF 0
8 8530 31870 19943 2017 CHE 0
9 8530 31870 19943 2017 GOU 0
10 8530 31870 19943 2017 OBR 0
# ℹ 450,596 more rows
3.2. Preliminary plots
Let’s plot abundance histogram for each species. As most species abundance are dominated by zeros, because of absence data, we plot log(abundance + 1) instead of raw abundances. This also implies that we will need to use zero-inflated distributions in our statistical model to account for this absence data.
catches |>
ggplot(aes(x = abundance + 1)) + # add 1 to avoid log(0)
geom_histogram(binwidth = 0.5) +
scale_x_log10() +
facet_wrap(~species)Let’s plot abundance time series for the 10 most abundant species.
n_species <- 10
common_species <- catches |>
group_by(species) |>
summarise(mean_abundance = mean(abundance)) |>
arrange(desc(mean_abundance)) |>
slice(1:n_species) |>
pull(species)
catches |>
filter(species %in% common_species) |>
group_by(year, species) |>
summarise(mean_abundance = mean(abundance)) |>
ggplot(aes(x = year, y = mean_abundance, colour = species)) +
geom_line()There is no clear trend, but we can’t say much at this point because trend may be masked by sampling efforts and environmental covariates.
Here it would be interesting to plot the number of operations per year to see the evolution of the sampling effort.
d_eel <- catches |> filter(species == "ANG")
d_eel.avg <- d_eel |>
group_by(pop_id) |>
summarise(abundance_mean = mean(abundance))
data <- inner_join(d_elev.nn, d_eel.avg, by = "pop_id")
ggplot(data, aes(x = altitude, y = abundance_mean)) +
geom_point() +
labs(
x = "Altitude (m)",
y = "Eel abundance"
)We clearly observe the expected trend of eel abundance with altitude.
Next, we want to model the effect of altitude, while accounting for time using INLA.
4. Statistical models
Data is ready to model eel abundances in space, time and with altitude. In this section, we will build increasingly complex models.
Before doing anything fancy, let’s load INLA, and scale our predictor variable (altitude) as it is good practice.
library(INLA)
data <- inner_join(d_eel, d_elev.nn, by = "pop_id")
data$altitude_s <- scale(data$altitude)4.1. Abundance distribution and overdisperion
Here, we focus on the choice of the distribution of our target variable: eel abundance. We will focus particularly on overdispersion, as we know that eel distribution is zero-inflated due to absence data. For the following model, we add a random effect for stations and year. We will discuss in further details these points in following sections.
4.1.1. Poisson model
We begin with a poisson GLM.
m.poisson <- inla(
abundance ~ 1 + altitude_s +
f(year, model = "iid") +
f(pop_id, model = "iid"),
family = "poisson",
data = data,
control.compute = list(config = TRUE, waic = TRUE, dic = TRUE)
)
summary(m.poisson)Time used:
Pre = 0.753, Running = 0.273, Post = 0.118, Total = 1.14
Fixed effects:
mean sd 0.025quant 0.5quant 0.975quant mode kld
(Intercept) 1.921 0.165 1.594 1.922 2.244 1.922 0
altitude_s -1.404 0.169 -1.739 -1.403 -1.075 -1.403 0
Random effects:
Name Model
year IID model
pop_id IID model
Model hyperparameters:
mean sd 0.025quant 0.5quant 0.975quant mode
Precision for year 10.86 2.719 6.395 10.57 17.03 10.03
Precision for pop_id 1.26 0.331 0.723 1.22 2.02 1.15
Deviance Information Criterion (DIC) ...............: 4552.01
Deviance Information Criterion (DIC, saturated) ....: 2685.27
Effective number of parameters .....................: -737.88
Watanabe-Akaike information criterion (WAIC) ...: 10992.89
Effective number of parameters .................: 2810.03
Marginal log-Likelihood: -3209.39
is computed
Posterior summaries for the linear predictor and the fitted values are computed
(Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')
We can plot the distribution of the fixed effect (altitude).
marginal.altitude <- m.poisson$marginals.fixed$altitude_s
ggplot(as.data.frame(marginal.altitude)) +
geom_line(aes(x = x, y = y)) +
labs(
x = "Altitude effect size",
y = "Probability"
)We can compute the highest posterior density (HDP) credible interval.
inla.hpdmarginal(0.97, m.poisson$marginals.fixed$altitude_s) low high
level:0.97 -1.774432 -1.037384
We can also sample from the posterior distribution.
m.poisson.samples <- inla.posterior.sample(100, m.poisson)We represent the sampled effect size of altitude against the marginal distribution obtained before.
fun <- function(...) {
altitude_s
}
altitude.sample <- inla.posterior.sample.eval(fun, m.poisson.samples)
tab <- data.frame(altitude = altitude.sample[1, ])
ggplot(tab, aes(x = altitude)) +
geom_histogram(aes(y = after_stat(density)), bins = 18) +
geom_line(data = as.data.frame(marginal.altitude), aes(x = x, y = y)) +
labs(
x = "Altitude effect size",
y = "Density"
)Next, we want to simulate data from the model. This step is important as it allows us to see if our model makes sense or not. In particular, we identify where our model fails and improve it.
alt_vals <- seq(-1, 3, length.out = 100)
y.sample <- inla.posterior.sample.eval(
fun = function(...) {
mu <- exp(Intercept + altitude_s * alt_vals)
abundance <- rpois(length(mu), mu)
},
samples = m.poisson.samples
)
library(bayestestR)
y.sum <- apply(y.sample, 1, function(x) {
y.hdi <- hdi(x, ci = 0.89)
c(mean = mean(x), hdi_low = y.hdi$CI_low, hdi_high = y.hdi$CI_high)
})
data.sim <- data.frame(
altitude_s = alt_vals,
t(y.sum)
)
ggplot(data.sim, aes(x = altitude_s, y = mean)) +
# geom_point(data = data, aes(x = altitude_s, y = abundance), colour = "grey80") +
geom_ribbon(aes(ymin = hdi_low, ymax = hdi_high), fill = "steelblue", alpha = 0.3) +
geom_line(colour = "steelblue") +
labs(
x = "Scaled altitude",
y = "Eel abundance"
)Next, we want to plot actual data against this prediction. Because we have not included spatial and temporal random effects while sampling the posterior, we are making predictions for an average site and year. Therefore, we want to average abundance data over site and year.
data.avg <- data |>
group_by(altitude_s) |>
summarise(
abundance_avg = mean(abundance),
abundance_var = var(abundance)
)
data.avg# A tibble: 32 × 3
altitude_s[,1] abundance_avg abundance_var
<dbl> <dbl> <dbl>
1 -0.979 13.0 52.4
2 -0.954 35.1 416.
3 -0.929 50.2 1692.
4 -0.904 30.3 235.
5 -0.830 43.1 2204.
6 -0.780 50.1 1004.
7 -0.705 37.6 350.
8 -0.680 42.4 624.
9 -0.655 26.6 332.
10 -0.630 9.44 15.3
# ℹ 22 more rows
ggplot(data.sim, aes(x = altitude_s, y = mean)) +
geom_point(data = data.avg, aes(x = altitude_s, y = abundance_avg), colour = "grey80") +
geom_ribbon(aes(ymin = hdi_low, ymax = hdi_high), fill = "steelblue", alpha = 0.3) +
geom_line(colour = "steelblue") +
labs(
x = "Scaled altitude",
y = "Eel abundance"
)We see that our model capture the clear decreasing trend of eel abundances with altitude.
Let’s investigate overdispersion with a posterior predictive check.
y_pred_avg <- apply(y.sample, 1, mean)
y_pred_var <- apply(y.sample, 1, var)
pred_df <- data.frame(
abundance_avg = y_pred_avg,
abundance_var = y_pred_var
)
pred_df$type <- "Posterior predictive"
pred_df$altitude_s <- alt_vals
data.avg$type <- "Observed"
plot_df <- rbind(pred_df, data.avg)
ggplot(plot_df, aes(x = abundance_avg, y = abundance_var, colour = type)) +
geom_point() +
geom_abline(slope = 1, intercept = 1) +
labs(
x = "Observed mean abundance",
y = "Observed variance in abundace"
)We see clearly that observed data is overdispersed (variance >> mean) compared to the data generated by our model. This suggest that the poisson distribution is not suited for this task.
4.1.2. Negative binomial model
To account for overdispersion, we will test the negative binomial distribution. Basically, we will repeat the previous section but with a model where poisson distribution is replaced with a negative binomial one.
m.nbinom <- inla(
abundance ~ 1 + altitude_s +
f(year, model = "iid") +
f(pop_id, model = "iid"),
family = "nbinomial",
data = data,
control.compute = list(config = TRUE, waic = TRUE, dic = TRUE)
)
summary(m.nbinom)Time used:
Pre = 0.459, Running = 0.267, Post = 0.0629, Total = 0.788
Fixed effects:
mean sd 0.025quant 0.5quant 0.975quant mode kld
(Intercept) 2.036 0.158 1.721 2.037 2.344 2.037 0
altitude_s -1.412 0.164 -1.737 -1.412 -1.092 -1.412 0
Random effects:
Name Model
year IID model
pop_id IID model
Model hyperparameters:
mean sd 0.025quant
size for the nbinomial observations (1/overdispersion) 2.18 0.185 1.836
Precision for year 18.59 8.697 7.432
Precision for pop_id 1.46 0.411 0.806
0.5quant 0.975quant
size for the nbinomial observations (1/overdispersion) 2.17 2.56
Precision for year 16.73 40.80
Precision for pop_id 1.41 2.41
mode
size for the nbinomial observations (1/overdispersion) 2.16
Precision for year 13.62
Precision for pop_id 1.31
Deviance Information Criterion (DIC) ...............: 3440.05
Deviance Information Criterion (DIC, saturated) ....: 622.13
Effective number of parameters .....................: 53.89
Watanabe-Akaike information criterion (WAIC) ...: 3443.63
Effective number of parameters .................: 50.78
Marginal log-Likelihood: -1783.55
is computed
Posterior summaries for the linear predictor and the fitted values are computed
(Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')
Let’s plot the model prediction against actual data.
m.nbinom.samples <- inla.posterior.sample(1000, m.nbinom)
alt_vals <- seq(-1, 3, length.out = 100)
y.sample <- inla.posterior.sample.eval(
fun = function(...) {
mu <- exp(Intercept + altitude_s * alt_vals)
abundance <- rnbinom(length(mu), mu = mu, size = theta[1])
},
samples = m.nbinom.samples
)
y.sum <- apply(y.sample, 1, function(x) {
y.hdi <- hdi(x, ci = 0.89)
c(mean = mean(x), hdi_low = y.hdi$CI_low, hdi_high = y.hdi$CI_high)
})
data.sim <- data.frame(
altitude_s = alt_vals,
t(y.sum)
)
ggplot(data.sim, aes(x = altitude_s, y = mean)) +
geom_point(data = data.avg, aes(x = altitude_s, y = abundance_avg), colour = "grey80") +
geom_ribbon(aes(ymin = hdi_low, ymax = hdi_high), fill = "steelblue", alpha = 0.3) +
geom_line(colour = "steelblue") +
labs(
x = "Scaled altitude",
y = "Eel abundance"
)We see that the negative binomial distribution better account for the variance. We will stick to this distribution for now, although we could have tried other distributions. Notably, the ZIP-poisson could be useful when we have many 0s.
4.2. Modeling time
Let’s now focus on modelling the effect of time. To begin with, let’s inspect the raw data.
ggplot(data, aes(x = year, y = abundance, color = altitude_s)) +
geom_jitter(width = 0.2) +
stat_summary(fun = mean, geom = "line", colour = "grey", size = 1) +
labs(x = "Year", ylab = "Eel abundance", color = "Altitude (scaled)")The plain line indicates the trend of the mean abundance over all sites. This trend is decreasing, we expect to retrieve this trend in our temporal random effects.
4.2.1. IID random effect
We begin by assuming that year random effects are independent from one another. We expect this assumption to be wrong, but let’s start from there and then explore more realistic modeling choices.
m.iid <- inla(
abundance ~ 1 + altitude_s +
f(year, model = "iid") +
f(pop_id, model = "iid"),
family = "nbinomial",
data = data,
control.compute = list(config = TRUE, waic = TRUE)
)
re.year.iid <- m.iid$summary.random$yearHere was our first try, yet we expect some correlation from one year to another. For this reason, time random effect are often modelled with autoregressive model (AR1 if lag 1).
4.2.2. AR1 Random effect
We modify the previous model changing the year random effect from IID to AR1.
m.ar <- inla(
abundance ~ 1 + altitude_s +
f(year, model = "ar1") +
f(pop_id, model = "iid"),
family = "nbinomial",
data = data,
control.compute = list(config = TRUE, waic = TRUE)
)
re.year.ar <- m.ar$summary.random$year4.2.3. RW1 random effect
Same with RW1 random effect.
m.rw <- inla(
abundance ~ 1 + altitude_s +
f(year, model = "rw1") +
f(pop_id, model = "iid"),
family = "nbinomial",
data = data,
control.compute = list(config = TRUE, waic = TRUE)
)
re.year.rw <- m.rw$summary.random$year4.2.4. Model comparisons
Now that we have the three models, we can compare them.
First, we can plot random effect vs year.
re.year.iid$type <- "iid"
re.year.ar$type <- "ar"
re.year.rw$type <- "rw"
re.year.all <- rbind(re.year.iid, re.year.ar, re.year.rw)
ggplot(re.year.all, aes(x = ID, y = mean, color = type)) +
geom_hline(yintercept = 0, linetype = "dashed", color = "grey50") +
geom_ribbon(aes(ymin = `0.025quant`, ymax = `0.975quant`), alpha = 0.1) +
geom_line() +
geom_point() +
labs(
x = "Year",
y = "Year random effect",
)We see that AR1 and RW1 produce random effects with a smooth trend. By contrast, IID random effects produces ‘chaotic’ random effects due to their independence. Second, we can inspect model WAIC, where lower values implies better out-of-sample predictive performance.
m.iid$waic$waic[1] 3443.628
m.ar$waic$waic[1] 3428.846
m.rw$waic$waic[1] 3428.494
We see that AR1 and RW1 have lower WAIC signaling better predictive ability, as we could have expected.
We can also do a predictive check by plotting predicted trends against the observed one.
pred_iid <- tapply(m.iid$summary.fitted.values$mean, data$year, mean)
pred_ar1 <- tapply(m.ar$summary.fitted.values$mean, data$year, mean)
pred_rw1 <- tapply(m.rw$summary.fitted.values$mean, data$year, mean)
obs <- tapply(data$abundance, data$year, mean)
df <- data.frame(
year = sort(unique(data$year)),
obs = obs,
iid = pred_iid,
ar1 = pred_ar1,
rw1 = pred_rw1
)
df <- df |> pivot_longer(
cols = c("obs", "iid", "ar1", "rw1"),
names_to = "type",
values_to = "mean_abundance"
)
ggplot(df, aes(x = year, y = mean_abundance, color = type)) +
geom_line() +
labs(
y = "Mean abundance",
x = "Year"
)Here, we see that there is no strong differences: all three models track well the observed trend. The IID model might be a bit overfitting the trend at the beginning, but otherwise there is not much to say.
4.3. Modeling space
Now that we have seen the different option to model time dynamics, we will focus on space.
4.3.1. IID random effect
First, we begin with IID random effects, assuming site are independent from one another. This assumption is probably unrealistic but still provide a useful baseline.
We fit the model, extract spatial random effects and join them to the dataframe containing their geographical positions.
m.iid <- inla(
abundance ~ 1 + altitude_s +
f(year, model = "ar1") +
f(pop_id, model = "iid"),
family = "nbinomial",
data = data,
control.compute = list(config = TRUE, waic = TRUE)
)
re.site.iid <- m.iid$summary.random$pop_id
pts.re.iid <- pts_vilaine |> left_join(
re.site.iid |> rename(pop_id = "ID"),
by = "pop_id"
)Next, we can plot the random effect on a map, to check for spatial correlation.
ggplot(pts.re.iid) +
geom_sf(aes(color = mean), size = 2) +
scale_color_gradient2(
low = "blue",
mid = "white",
high = "red",
midpoint = 0
) +
labs(
color = "RE site",
)We do see a clear spatial correlation, from west to east going from negative to positive random effects. This spatial correlation in the residuals motivates the use of spatially structured residuals. We explore the different options to do so in the following subsections.
4.3.2. SPDE random effect
First step is to get site coordinates in meters, because angles distort distances between sites, which is going to bias our spatial model.
pts_vilaine.proj <- st_transform(pts_vilaine, 2154) # Coordinates in meters.
coords <- st_coordinates(pts_vilaine.proj)From these coordinates we can build the mesh using fmesher and visualise it.
library(fmesher)
mesh <- fm_mesh_2d_inla(loc = coords)
plot(mesh)
points(coords, col = "red", pch = 16, cex = 2)The mesh can be customised to increase or decrease its resolution using additional parameters. For our simple case study, we won’t focus on these details.
Next, we define the SPDE model and the projection matrix \(A\). \(A\) projects the spatial random field to actual observation locations.
spde <- inla.spde2.matern(mesh)
A <- inla.spde.make.A(mesh = mesh, loc = coords)
spatial_index <- inla.spde.make.index(
name = "spatial.field",
n.spde = spde$n.spde
)
dim(A)[1] 37 79
nrow(mesh$loc)[1] 79
We see that \(A\) is of dimension (number of sites x number of mesh knots).
Because we have multiple observations per sites we have to expand A so the spatial random field is projected for each observations (and not locations). To do so, we have to match each observation to its location.
site_idx <- match(data$pop_id, pts_vilaine$pop_id)
A_expanded <- A[site_idx, ]
dim(A_expanded)[1] 526 79
nrow(data)[1] 526
Next, we can build the data stack which combines: data, effects (i.e. predictors) and observation matrices.
# Prepare the data stack
data_stack <- inla.stack(
data = list(y = data$abundance),
A = list(A_expanded, 1),
effects = list(
spatial.field = spatial_index,
data.frame(
intercept = rep(1, nrow(data)),
altitude_s = data$altitude_s,
year = data$year,
pop_id = data$pop_id
)
),
tag = "est"
)
data_stackData stack with
data: (y), size: 526
effects: (spatial.field, spatial.field.group, spatial.field.repl, intercept, altitude_s, year, pop_id), size: 563
A: 526 times 563
Now everything is (finally) ready to fit the model.
m.spde <- inla(
y ~ -1 + intercept + altitude_s +
f(year, model = "ar1") +
f(spatial.field, model = spde),
family = "nbinomial",
data = inla.stack.data(data_stack),
control.predictor = list(A = inla.stack.A(data_stack)),
control.compute = list(config = TRUE, waic = TRUE)
)
summary(m.spde)Time used:
Pre = 0.754, Running = 0.498, Post = 0.105, Total = 1.36
Fixed effects:
mean sd 0.025quant 0.5quant 0.975quant mode kld
intercept 2.056 0.313 1.433 2.055 2.692 2.055 0
altitude_s -1.396 0.229 -1.848 -1.397 -0.941 -1.397 0
Random effects:
Name Model
year AR1 model
spatial.field SPDE2 model
Model hyperparameters:
mean sd 0.025quant
size for the nbinomial observations (1/overdispersion) 2.21 0.185 1.867
Precision for year 14.05 7.264 4.333
Rho for year 0.77 0.118 0.489
Theta1 for spatial.field 7.51 0.397 6.691
Theta2 for spatial.field -8.77 0.321 -9.373
0.5quant 0.975quant
size for the nbinomial observations (1/overdispersion) 2.203 2.597
Precision for year 12.593 32.131
Rho for year 0.791 0.939
Theta1 for spatial.field 7.526 8.253
Theta2 for spatial.field -8.783 -8.111
mode
size for the nbinomial observations (1/overdispersion) 2.189
Precision for year 9.889
Rho for year 0.837
Theta1 for spatial.field 7.587
Theta2 for spatial.field -8.829
Watanabe-Akaike information criterion (WAIC) ...: 3431.84
Effective number of parameters .................: 45.48
Marginal log-Likelihood: -1778.47
is computed
Posterior summaries for the linear predictor and the fitted values are computed
(Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')
First, let’s compare the WAIC of the SPDE model with the IID one.
m.iid$waic$waic[1] 3428.852
m.spde$waic$waic[1] 3431.841
We see that they perform similarly in terms of predictive performance. This inspection doesn’t indicate that a model is clearly better than another.
WAIC is a measure of out-of-sample predictive performance. A model can be wrong (in the sense of causal structure) and predict well, while a model can be right and predict poorly. It is important to keep this fact in mind while comparing modelling performance with WAIC.
To go further, we extract spatial random effects and want to visual them.
re.site.spde <- m.spde$summary.random$spatial.field
proj <- fm_evaluator(mesh)
# Get spatial field mean and sd
spatial_mean <- proj$proj$A %*% re.site.spde$mean
spatial_sd <- proj$proj$A %*% re.site.spde$sd
# Create data frame for ggplot
spatial_proj <- data.frame(
x = rep(proj$x, length(proj$y)),
y = rep(proj$y, each = length(proj$x)),
mean = as.vector(spatial_mean),
sd = as.vector(spatial_sd)
)We also extract the edges of the river network to plot it on the subsequent figure.
edges_sf <- net %>%
activate("edges") %>%
st_as_sf()
edges_sf <- st_transform(edges_sf, st_crs(pts_vilaine.proj))Visualize the spatial random effects on a map.
ggplot(spatial_proj, aes(x = x, y = y, fill = mean)) +
geom_raster() +
scale_fill_gradient2(
low = "blue",
mid = "white",
high = "red",
midpoint = 0
) +
geom_sf(
data = edges_sf,
inherit.aes = FALSE,
size = 0.3,
alpha = 0.3
) +
geom_sf(
data = pts_vilaine.proj,
aes(geometry = geometry),
inherit.aes = FALSE
) +
labs(
fill = "SPDE RE",
x = "",
y = ""
)As well as their uncertainty.
ggplot(spatial_proj, aes(x = x, y = y, fill = sd)) +
geom_raster() +
geom_sf(data = edges_sf, inherit.aes = FALSE, color = "black", size = 0.3) +
geom_sf(
data = pts_vilaine.proj,
aes(geometry = geometry),
inherit.aes = FALSE
) +
labs(
fill = "SD RE",
x = "",
y = ""
)As expected we see that uncertainty is lower near sites, and increases as we go further from them.
We see from plot above that SPDE makes sense for continuous process in space, as it can interpolate anywhere in the plane based on some assumed correlation structure. However, we know that eel do not live on land and therefore that their abundane is highly discontinuous. Some methods have adpated the SPDE framework to account for spatial barriers. Yet, this method doesn’t seem suited for river networks as we would have mostly barrier with a very complex topology. So let’s try something else.